{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "mAGKZvMaBmWe",
    "outputId": "96aacb80-bb4c-4d0e-fce1-2862c223ce15"
   },
   "source": [
    "# How to use text descriptors in text-specific metrics?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "tmBHvzszAEN-"
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "from sklearn import datasets, ensemble, model_selection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "-Xko20Q1FvZV"
   },
   "outputs": [],
   "source": [
    "from evidently import ColumnMapping\n",
    "from evidently.report import Report\n",
    "from evidently.test_suite import TestSuite\n",
    "\n",
    "from evidently.metric_preset import TextOverviewPreset, TextEvals\n",
    "\n",
    "from evidently.metrics import TextDescriptorsDriftMetric\n",
    "from evidently.metrics import TextDescriptorsDistribution\n",
    "from evidently.metrics import TextDescriptorsCorrelationMetric\n",
    "from evidently.metrics import ColumnDriftMetric, ColumnSummaryMetric\n",
    "\n",
    "from evidently.descriptors import TextLength, IncludesWords, OOV, NonLetterCharacterPercentage, SentenceCount, WordCount, Sentiment, RegExp, SemanticSimilarity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "HmpgBJFqH7y-"
   },
   "outputs": [],
   "source": [
    "import nltk\n",
    "\n",
    "# NLTK data packages fetched up front; presumably required by the OOV and Sentiment\n",
    "# descriptors used further down (TODO: confirm against the Evidently documentation).\n",
    "NLTK_RESOURCES = ('words', 'wordnet', 'omw-1.4', 'vader_lexicon')\n",
    "\n",
    "# A loop keeps the resource list in one place and avoids echoing a bare `True` as cell output.\n",
    "for resource in NLTK_RESOURCES:\n",
    "    nltk.download(resource)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "Y_EV_iCeCiz1"
   },
   "outputs": [],
   "source": [
    "# Load the clothing-reviews dataset from OpenML; its Review_Text and Title columns\n",
    "# are the text columns used by every example below.\n",
    "reviews_data = datasets.fetch_openml(\n",
    "    name='Womens-E-Commerce-Clothing-Reviews',\n",
    "    version=2,\n",
    "    as_frame='auto',\n",
    ")\n",
    "reviews = reviews_data.frame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "YWyVOGsM2gV7"
   },
   "outputs": [],
   "source": [
    "# Reference = reviews with Rating > 3, current = reviews with Rating < 3 (Rating == 3 is left out).\n",
    "# Both sides are drawn with replacement to 5000 rows with a fixed seed, so re-runs give the same split.\n",
    "sample_kwargs = dict(n=5000, replace=True, ignore_index=True, random_state=42)\n",
    "\n",
    "reviews_ref = reviews[reviews.Rating > 3].sample(**sample_kwargs)\n",
    "reviews_cur = reviews[reviews.Rating < 3].sample(**sample_kwargs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "1lUgz2JlAheU"
   },
   "outputs": [],
   "source": [
    "# Preview the first rows of the full (unsampled) dataset\n",
    "reviews.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "60Wxndq7FzGT"
   },
   "outputs": [],
   "source": [
    "# Declare the role of each column; the two free-text columns are registered as text features.\n",
    "numerical_columns = ['Age', 'Positive_Feedback_Count']\n",
    "categorical_columns = ['Division_Name', 'Department_Name', 'Class_Name']\n",
    "text_columns = ['Review_Text', 'Title']\n",
    "\n",
    "column_mapping = ColumnMapping(\n",
    "    numerical_features=numerical_columns,\n",
    "    categorical_features=categorical_columns,\n",
    "    text_features=text_columns,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "D7YDxkF0qCrf"
   },
   "source": [
    "# Text Metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "GM93q8XcqCrh"
   },
   "outputs": [],
   "source": [
    "# Baseline: no `descriptors` argument is passed, so each metric uses whatever descriptors it applies by default.\n",
    "default_metric_classes = (\n",
    "    TextDescriptorsDriftMetric,\n",
    "    TextDescriptorsDistribution,\n",
    "    TextDescriptorsCorrelationMetric,\n",
    ")\n",
    "\n",
    "text_specific_metrics_report = Report(\n",
    "    metrics=[metric_class(column_name=\"Review_Text\") for metric_class in default_metric_classes]\n",
    ")\n",
    "text_specific_metrics_report.run(\n",
    "    reference_data=reviews_ref,\n",
    "    current_data=reviews_cur,\n",
    "    column_mapping=column_mapping,\n",
    ")\n",
    "text_specific_metrics_report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "l2zzm1irqCri"
   },
   "outputs": [],
   "source": [
    "# Same metrics, now with explicit descriptors. Each dict key is the name given to the\n",
    "# descriptor it maps to (presumably shown as its label in the rendered report).\n",
    "review_text_descriptors = {\n",
    "    \"Review Text Length\" : TextLength(),\n",
    "    \"Reviews about Dress\" : IncludesWords(words_list=['dress', 'gown']),\n",
    "    \"Review about Blouses\" : IncludesWords(words_list=['blouse', 'shirt']),\n",
    "    \"Review Sentence Count\" : SentenceCount(),\n",
    "    \"Review Word Count\" : WordCount(),\n",
    "    \"Review Sentiment\" : Sentiment(),\n",
    "    \"Review questions\": RegExp(reg_exp=r'.*\\?.*'),\n",
    "}\n",
    "title_descriptors = {\n",
    "    \"Title OOV\" : OOV(),\n",
    "    \"Title Non Letter %\" : NonLetterCharacterPercentage(),\n",
    "    \"Title Length\" : TextLength(),\n",
    "    \"Title Sentence Count\" : SentenceCount(),\n",
    "    \"Title Word Count\" : WordCount(),\n",
    "    \"Title Sentiment\" : Sentiment(),\n",
    "    \"Title questions\": RegExp(reg_exp=r'.*\\?.*'),\n",
    "}\n",
    "\n",
    "# Drift is evaluated on the review body, correlations on the title.\n",
    "report = Report(metrics=[\n",
    "    TextDescriptorsDriftMetric(\"Review_Text\", descriptors=review_text_descriptors),\n",
    "    TextDescriptorsCorrelationMetric(column_name=\"Title\", descriptors=title_descriptors),\n",
    "])\n",
    "report.run(\n",
    "    reference_data=reviews_ref,\n",
    "    current_data=reviews_cur,\n",
    "    column_mapping=column_mapping,\n",
    ")\n",
    "report"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "IAe0g1SWlV4L"
   },
   "source": [
    "# Text Evals and Text Overview Presets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TextEvals on one text column without explicit descriptors.\n",
    "# Only the first 100 rows of each dataset are passed to the report.\n",
    "text_evals_report = Report(metrics=[TextEvals(column_name=\"Review_Text\")])\n",
    "\n",
    "text_evals_report.run(\n",
    "    reference_data=reviews_ref.head(100),\n",
    "    current_data=reviews_cur.head(100),\n",
    "    column_mapping=column_mapping,\n",
    ")\n",
    "text_evals_report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TextEvals with an explicit list of descriptors instead of the defaults.\n",
    "evals_descriptors = [\n",
    "    SentenceCount(),\n",
    "    WordCount(),\n",
    "    Sentiment(),\n",
    "    IncludesWords(words_list=['blouse', 'shirt']),\n",
    "]\n",
    "\n",
    "text_evals_report = Report(metrics=[\n",
    "    TextEvals(column_name=\"Review_Text\", descriptors=evals_descriptors),\n",
    "])\n",
    "# Only the first 100 rows of each dataset are passed to the report.\n",
    "text_evals_report.run(\n",
    "    reference_data=reviews_ref.head(100),\n",
    "    current_data=reviews_cur.head(100),\n",
    "    column_mapping=column_mapping,\n",
    ")\n",
    "text_evals_report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "msjgy3j-f-5i"
   },
   "outputs": [],
   "source": [
    "# TextOverviewPreset for a single column, no explicit descriptors.\n",
    "overview_preset = TextOverviewPreset(column_name=\"Review_Text\")\n",
    "\n",
    "text_overview_report = Report(metrics=[overview_preset])\n",
    "text_overview_report.run(\n",
    "    reference_data=reviews_ref,\n",
    "    current_data=reviews_cur,\n",
    "    column_mapping=column_mapping,\n",
    ")\n",
    "text_overview_report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TextOverviewPreset for several columns at once (`columns=` instead of `column_name=`), no explicit descriptors.\n",
    "# Only the first 100 rows of each dataset are passed to the report.\n",
    "overview_columns = [\"Review_Text\", \"Title\"]\n",
    "\n",
    "text_overview_report = Report(metrics=[TextOverviewPreset(columns=overview_columns)])\n",
    "text_overview_report.run(\n",
    "    reference_data=reviews_ref.head(100),\n",
    "    current_data=reviews_cur.head(100),\n",
    "    column_mapping=column_mapping,\n",
    ")\n",
    "text_overview_report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "9H8xHPN-tYY8"
   },
   "outputs": [],
   "source": [
    "# TextOverviewPreset for a single column, this time with an explicit list of descriptors.\n",
    "overview_descriptors = [\n",
    "    OOV(),\n",
    "    NonLetterCharacterPercentage(),\n",
    "    TextLength(),\n",
    "    IncludesWords(words_list=['dress', 'gown']),\n",
    "    IncludesWords(words_list=['blouse', 'shirt']),\n",
    "    SentenceCount(),\n",
    "    WordCount(),\n",
    "    Sentiment(),\n",
    "    RegExp(reg_exp=r'.*\\?.*'),\n",
    "]\n",
    "\n",
    "text_overview_report = Report(metrics=[\n",
    "    TextOverviewPreset(column_name=\"Review_Text\", descriptors=overview_descriptors),\n",
    "])\n",
    "text_overview_report.run(\n",
    "    reference_data=reviews_ref,\n",
    "    current_data=reviews_cur,\n",
    "    column_mapping=column_mapping,\n",
    ")\n",
    "text_overview_report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TextOverviewPreset for several columns, with the same explicit descriptor list passed for all of them.\n",
    "overview_columns = [\"Review_Text\", \"Title\"]\n",
    "overview_descriptors = [\n",
    "    OOV(),\n",
    "    NonLetterCharacterPercentage(),\n",
    "    TextLength(),\n",
    "    IncludesWords(words_list=['dress', 'gown']),\n",
    "    IncludesWords(words_list=['blouse', 'shirt']),\n",
    "    SentenceCount(),\n",
    "    WordCount(),\n",
    "    Sentiment(),\n",
    "    RegExp(reg_exp=r'.*\\?.*'),\n",
    "]\n",
    "\n",
    "text_overview_report = Report(metrics=[\n",
    "    TextOverviewPreset(columns=overview_columns, descriptors=overview_descriptors),\n",
    "])\n",
    "# Only the first 100 rows of each dataset are passed to the report.\n",
    "text_overview_report.run(\n",
    "    reference_data=reviews_ref.head(100),\n",
    "    current_data=reviews_cur.head(100),\n",
    "    column_mapping=column_mapping,\n",
    ")\n",
    "text_overview_report"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "-9b5WTooqCrn"
   },
   "source": [
    "# Column Drift Metric for Text Descriptors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "82T9iuN2qCro"
   },
   "outputs": [],
   "source": [
    "# ColumnDriftMetric receives a descriptor bound to a text column in place of a plain column name.\n",
    "text_length_column = TextLength(display_name=\"TextLength\").for_column(\"Review_Text\")\n",
    "blouse_mentions_column = IncludesWords(\n",
    "    words_list=['blouse', 'shirt'],\n",
    "    display_name=\"Review about Blouses\",\n",
    ").for_column(\"Review_Text\")\n",
    "\n",
    "drift_report = Report(metrics=[\n",
    "    ColumnDriftMetric(column_name=text_length_column),\n",
    "    ColumnDriftMetric(column_name=blouse_mentions_column),\n",
    "])\n",
    "drift_report.run(\n",
    "    reference_data=reviews_ref,\n",
    "    current_data=reviews_cur,\n",
    "    column_mapping=column_mapping,\n",
    ")\n",
    "drift_report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "g0d1llGdcE6I"
   },
   "outputs": [],
   "source": [
    "# SemanticSimilarity is bound to a pair of text columns via `.on([...])` and summarised like a regular column.\n",
    "# Only the first 10 rows of each dataset are passed to the report.\n",
    "similarity_column = SemanticSimilarity().on([\"Review_Text\", \"Title\"])\n",
    "\n",
    "summary_report = Report(metrics=[ColumnSummaryMetric(column_name=similarity_column)])\n",
    "summary_report.run(\n",
    "    reference_data=reviews_ref.head(10),\n",
    "    current_data=reviews_cur.head(10),\n",
    "    column_mapping=column_mapping,\n",
    ")\n",
    "summary_report"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
